library(tidyverse)
# Load the baby-name records from the Lab 9 folder.
names <- here::here("Labs", "Lab 9", "StateNames_A.csv") |>
  read_csv()
Challenge 9: Baby Names
Parts 1 & 2: The Data
Part 3: Summarizing & Visualizing the Number of Allisons
Question 1
# Rename the Gender column to Sex
names <- names |>
  rename(Sex = Gender)
Challenge Part 2 Section 2
library(DT)
datatable(names)

# Total number of babies named "Allison" in each state, one column per sex.
# Filtering before grouping keeps the grouped summary small.
# `.groups = "drop"` returns an ungrouped tibble, so later steps are not
# silently evaluated per state. `values_fill = 0` gives a count of zero to
# every state/sex combination that has no records (not only the M column).
allison_names <- names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  pivot_wider(names_from = Sex,
              values_from = Count,
              values_fill = 0)
allison_names
# A tibble: 51 × 3
State F M
<chr> <dbl> <dbl>
1 AK 232 0
2 AL 1535 0
3 AR 1198 0
4 AZ 1880 0
5 CA 12413 0
6 CO 1594 0
7 CT 1099 0
8 DC 321 0
9 DE 294 0
10 FL 4455 0
# … with 41 more rows
Question 2
# Records for the name "Allison" where Sex is "F"
allison_F_names <- names |>
  filter(Name == "Allison" & Sex == "F")
Question 3
# Yearly totals of female Allisons; later questions reuse this data frame
names_by_year <- allison_F_names |>
  group_by(Year) |>
  summarize(Count = sum(Count))

# Bar chart of the yearly totals
ggplot(data = names_by_year, mapping = aes(x = Year, y = Count)) +
  geom_col() +
  labs(title = "Number of people assigned female at birth with the name Allison")
Part 4: Modeling the Number of Allisons
Question 4
# Linear regression of the yearly count of Allisons on year
allison_lm <- lm(Count ~ Year, data = names_by_year)
allison_lm
Call:
lm(formula = Count ~ Year, data = names_by_year)
Coefficients:
(Intercept) Year
209689.8 -101.5
Question 5
# Scatterplot of the yearly counts with a fitted linear-model smooth overlaid
ggplot(data = names_by_year,
       mapping = aes(x = Year, y = Count)) +
  geom_point() +
  stat_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

Question 6
Estimated Regression Equation:
$\widehat{\text{Count}} = 209689.8 - 101.5 \times \text{Year}$
Question 7
# Attach the model residuals to the data and plot them against the observed counts.
# NOTE(review): residuals are on the x-axis and observed Count on the y-axis; a
# conventional residual plot shows .resid (y) against .fitted (x) -- consider
# switching, and revisit the written interpretation if you do.
broom::augment(allison_lm) |>
  ggplot(aes(x = .resid, y = Count)) +
  geom_point() +
  labs(x = "Residuals")
The residuals show a pattern: they increase as the number of “Allisons” per year increases. This is not ideal, because it suggests that the residuals are not random, so the conditions for a linear model may not be satisfied.
Question 8
From this model, we can conclude that the name Allison is becoming less and less popular: the fitted slope estimates about 101.5 fewer babies named Allison each year. In terms of how often the name is used, it may seem less “cool”, but that’s not necessarily true!
Part 5: Spelling by State
Question 1
# Male records for the three spellings Allan, Alan, and Allen
allan_names <- names |>
  filter(Name %in% c("Allan", "Alan", "Allen"),
         Sex == "M")
Question 2
# Counts of each spelling for CA and PA in the year 2000, one column per
# spelling; a spelling with no count for a state is shown as 0.
statenames <- allan_names |>
  pivot_wider(names_from = Name, values_from = Count) |>
  filter(Year == 2000, State %in% c("CA", "PA")) |>
  select(State, Allen, Alan, Allan) |>
  mutate(across(c(Allen, Alan, Allan), \(count) coalesce(count, 0)))
statenames# A tibble: 2 × 4
State Allen Alan Allan
<chr> <dbl> <dbl> <dbl>
1 CA 176 579 131
2 PA 56 51 12
Question 3
# Convert each state's spelling counts into proportions of that state's total.
# Column arithmetic already works row by row, so rowwise()/c_across() is not
# needed; dropping it also returns a plain (not rowwise) tibble, which avoids
# per-row evaluation in any later step that uses `percents`.
percents <- statenames |>
  mutate(
    total_sum = Allen + Alan + Allan,
    Allen = Allen / total_sum,
    Alan = Alan / total_sum,
    Allan = Allan / total_sum
  ) |>
  select(!total_sum)  # the helper total is not part of the final table
percents
# A tibble: 2 × 4
State Allen Alan Allan
<chr> <dbl> <dbl> <dbl>
1 CA 0.199 0.653 0.148
2 PA 0.471 0.429 0.101
Challenge: Creating Nice Tables
Part 1 and Step 1 of Part 2
library(kableExtra)
# Table of Allisons by State
# allison_names has the columns State, F, M, so the labels must describe the
# per-sex counts (the second column holds the F counts, the third the M counts).
knitr::kable(allison_names,
             format = "html",
             col.names = c("State",
                           "Allisons Assigned Female at Birth",
                           "Allisons Assigned Male at Birth"),
             booktabs = TRUE) |>
  kable_styling(latex_options = "scale_down")
| State | Allisons Assigned Female at Birth | Allisons Assigned Male at Birth |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
# Raw counts of each spelling, with a grouping header above the columns
statenames |>
  knitr::kable(format = "html",
               caption = "Number of 'Alans' by State",
               booktabs = TRUE) |>
  add_header_above(c("Location" = 1, "Name" = 3))
| State | Allen | Alan | Allan |
|---|---|---|---|
| CA | 176 | 579 | 131 |
| PA | 56 | 51 | 12 |
# Allan names by percent
# `percents` holds proportions (0-1). Scale them to 0-100 and round to one
# decimal so the displayed values match the "Percentage" caption.
percents |>
  mutate(across(Allen:Allan, \(p) 100 * p)) |>
  knitr::kable(format = "html",
               digits = 1,
               caption = "Percentage of 'Alans' by State",
               booktabs = TRUE) |>
  kable_styling(font_size = 18)
| State | Allen | Alan | Allan |
|---|---|---|---|
| CA | 19.9 | 65.3 | 14.8 |
| PA | 47.1 | 42.9 | 10.1 |